library(readr)
library(dplyr)
library(tidyr)

#1) load character_words
#2) change lemmas for patient words, labelling as "was_lemma"
#the second step will help us later distinguish between different uses of words
#3) add decade data for sampling purposes

# Load the per-word character data and prepare it for sampling:
#   - drop rows whose character gender is "U" or whose predicted author gender
#     is "U", 0 or "#N/A"
#   - keep only publications dated 1850 or later
#   - prefix the lemma of every "patient" relation with "was_" so that it can
#     be told apart from other uses of the same word
#   - derive the publication decade (e.g. 1857 -> 1850)
character_gender <- read_csv(
  "/Users/cheng/OneDrive/Desktop/character/character/1_character_gender/1_output.csv"
) %>%
  filter(
    !character_gender %in% "U",
    !predicted_author_gender %in% c("U", 0, "#N/A")
  ) %>%
  filter(publ_date >= 1850) %>%
  # paste0, not paste: paste() inserts a space and would yield "was_ lemma"
  mutate(lemma = ifelse(relation %in% "patient", paste0("was_", lemma), lemma)) %>%
  mutate(pub_decade = publ_date - publ_date %% 10)

#4) calculate relative frequency of anatomical description, relativizing for total number of words used in characterization
#for a given text
#5) filter for anatomical words, because our classification just focuses on that subset of words
# Relative frequency of each word within its text: the word's count divided by
# the total count of characterization words in the same file. The total is
# computed over ALL characterization words before the table is narrowed to the
# anatomical subset, so relFreq stays relative to the whole text.
body_relfreq <- character_gender %>%
  group_by(filename) %>%
  mutate(
    characterWRDS = sum(count),
    relFreq = count / characterWRDS
  ) %>%
  # drop the per-file grouping so it does not leak into later steps
  ungroup() %>%
  filter(characterization %in% "anatomical")
  
# Now we want to grab samples from each decade. 
# All of the characters we scraped back in python have at least 40, so our samples are of characters of similar "weight"

# For the size of our corpus, we'll be getting 450 characters from each decade (225 men, 225 women), and their word data
# We also want to look at more commonly used words between each character. For one, to make sure we're comparing apples to apples.
# Second, it'll let us see gendered differences in more commonly used words

# Roster of candidate characters: one row per (filename, name), limited to
# characters that have at least one anatomical word. It is identical for every
# sample, so it is built once here rather than inside the loop. pub_decade is
# carried along because the sampling below is stratified by decade.
file_name <- body_relfreq %>%
  ungroup() %>%
  filter(characterization %in% "anatomical") %>%
  select(filename, name, character_gender, pub_decade) %>%
  distinct()

# Draw 15 independent samples; each is written to its own .RData file.
for (f in 1:15) {
  # set output filename
  spread_name <- paste0(
    "/Users/cheng/OneDrive/Desktop/character/character/3_prepare_samples/output_sample_data/spread",
    f, ".RData"
  )

  # 6) Sample characters: 225 men and 225 women from each decade.
  # sample_n() stops with an error if a decade has fewer than 225 candidates.
  m_sample_name <- file_name %>%
    filter(character_gender %in% "M") %>%
    group_by(pub_decade) %>%
    sample_n(225) %>%
    ungroup()

  f_sample_name <- file_name %>%
    filter(character_gender %in% "F") %>%
    group_by(pub_decade) %>%
    sample_n(225) %>%
    ungroup()

  # Pull the word rows of the sampled characters, anatomical words only.
  # The match is on the (filename, name) PAIR: testing name and filename
  # separately would also pick up unsampled characters who merely share a name
  # with one sampled character and a file with another.
  m_sample <- body_relfreq %>%
    semi_join(m_sample_name, by = c("filename", "name")) %>%
    filter(characterization %in% "anatomical")

  f_sample <- body_relfreq %>%
    semi_join(f_sample_name, by = c("filename", "name")) %>%
    filter(characterization %in% "anatomical")

  # combine dataframes
  # 7) within each decade keep only rows at or above the 90th percentile of
  #    relFreq
  combine_sample <- bind_rows(m_sample, f_sample) %>%
    group_by(pub_decade) %>%
    mutate(quantile = quantile(relFreq, .90)) %>%
    filter(relFreq >= quantile)

  # select columns needed for classification
  final_sample <- combine_sample %>%
    select(
      pub_decade, predicted_author_gender, character_gender,
      lemma, count, characterWRDS, relFreq
    ) %>%
    rename(predicted_character_gender = character_gender)

  # 8) spread data for classification: one column per lemma, missing -> 0
  # NOTE(review): no per-character id (filename/name) is kept here, so spread()
  # may fail on duplicate keys when several characters in the same decade and
  # gender cell share a lemma -- confirm whether an id column should be kept.
  final_spread <- final_sample %>%
    select(
      pub_decade, predicted_author_gender, predicted_character_gender,
      lemma, relFreq
    ) %>%
    spread(lemma, relFreq) %>%
    replace(., is.na(.), 0)

  save(final_spread, file = spread_name)
}
